In [43]:
class color:
    """Named ANSI escape sequences for styling printed text.

    Prefix a string with one of the codes and append ``END`` to reset the
    styling afterwards, e.g. ``color.BOLD + "title" + color.END``.
    """

    # Foreground colours.
    PURPLE = "\033[95m"
    CYAN = "\033[96m"
    DARKCYAN = "\033[36m"
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"

    # Text attributes.
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # Resets every colour/attribute set by the codes above.
    END = "\033[0m"

from pathlib import Path

import librosa
from IPython.display import display, Audio
from scipy.signal import butter, lfilter, sosfilt

def butter_bandpass(lowcut, highcut, fs, order=5):
    """Design a Butterworth band-pass filter in transfer-function form.

    Parameters
    ----------
    lowcut, highcut : float
        Lower and upper band edges, in the same unit as ``fs``.
    fs : float
        Sampling rate of the signal the filter will be applied to.
    order : int, optional
        Order passed to ``scipy.signal.butter`` (default 5).

    Returns
    -------
    tuple
        The ``(b, a)`` coefficient arrays returned by ``scipy.signal.butter``.
    """
    nyquist = 0.5 * fs
    # ``butter`` expects band edges as fractions of the Nyquist frequency.
    normalized_band = [edge / nyquist for edge in (lowcut, highcut)]
    return butter(order, normalized_band, btype='band')


def butter_bandpass_filter(data, lowcut, highcut, fs, order=5):
    """Band-pass filter ``data`` with a Butterworth filter.

    The filter is designed and applied as cascaded second-order sections
    (SOS) rather than as a single ``(b, a)`` transfer function. SciPy
    recommends the SOS form for filtering because high-order polynomial
    coefficients lose precision, which can make band-pass filters with a low
    or narrow band inaccurate or unstable.

    Parameters
    ----------
    data : array_like
        Signal to filter; filtering runs along the last axis.
    lowcut, highcut : float
        Lower and upper band edges, in the same unit as ``fs``.
    fs : float
        Sampling rate of ``data``.
    order : int, optional
        Butterworth order passed to ``scipy.signal.butter`` (default 5).

    Returns
    -------
    numpy.ndarray
        The filtered signal, same shape as ``data``.
    """
    nyq = 0.5 * fs
    # ``butter`` expects band edges as fractions of the Nyquist frequency.
    sos = butter(order, [lowcut / nyq, highcut / nyq], btype='band', output='sos')
    return sosfilt(sos, data)


# Sub-folders of ``wav_files``: ``tech_dir`` holds the singing-technique
# conversion clips and ``sid_dir`` the singer conversion clips.
sid_dir, tech_dir = 'demo_paper-sid-1', 'demo_paper-tech-3'

# Root folder of the generated audio.
# NOTE(review): absolute, machine-specific path -- adjust before re-running elsewhere.
wav_files = Path(
    '/data/yinjyun/projects/VocalVAE-RNN-update/results_final/model-new_att-True_techCond-none_bi-True_adv-False_trainVar-False_refine-True_et-rnn_drnn-True_chunkCE-True_seqCE-True_var--2_None/'
)

Singing Voice Conversion with Disentangled Representations of Singer and Vocal Technique Using Variational Autoencoders

Yin-Jyun Luo$^{1, 3}$, Chin-Cheng Hsu$^{2}$, Kat Agres$^{3,4}$, Dorien Herremans$^{1,3}$

$^{1}$Singapore University of Technology and Design
$^{2}$University of Southern California
$^{3}$Institute of High Performance Computing, A*STAR, Singapore
$^{4}$Yong Siew Toh Conservatory of Music, National University of Singapore
$\tt yinjyun\_luo@mymail.sutd.edu.sg$

Many-to-Many Singing Technique Conversion

We focus on singing technique conversion in this demo page.
The following are the audio samples of Fig. 2(b) in the paper.

The audio files below are all converted from Mel-spectrograms using Griffin-Lim.
Therefore, the audio labelled "Original Mel-spectrogram" is the upper bound of the audio quality for each conversion.

We first give examples of each singing technique

Again, these samples are obtained by inverting their Mel-spectrograms to audio.

In [53]:
# Playback settings shared by every clip in this cell.
sr = 22050   # sample rate (Hz) used for loading, filter design and playback
low = 150    # band-pass lower cutoff (Hz)
high = 2300  # band-pass upper cutoff (Hz)

# (printed label, file name) -- one example clip per singing technique.
technique_examples = [
    ("f1 straight", 'source-f1_scales_straight_o.wav'),
    ("m6 belt", 'source-m6_arpeggios_belt_e.wav'),
    ("f6 breathy", 'source-f6_scales_breathy_a.wav'),
    ("f9 lip trill", 'source-f9_scales_lip_trill_e.wav'),
    ("m1 vibrato", 'source-m1_arpeggios_vibrato_e.wav'),
    ("m3 vocal fry", 'source-m3_arpeggios_vocal_fry_u.wav'),
]

for label, file_name in technique_examples:
    print(label)
    # Load at ``sr`` explicitly so the audio rate always matches the rate the
    # filter is designed for and the player is told to use.
    x, _ = librosa.load(wav_files / tech_dir / file_name, sr=sr)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
f1 straight
m6 belt
f6 breathy
f9 lip trill
m1 vibrato
m3 vocal fry

The first row of Fig. 2(b) in the paper

In [54]:
# Playback settings shared by every clip in this cell.
sr = 22050   # sample rate (Hz) used for loading, filter design and playback
low = 150    # band-pass lower cutoff (Hz)
high = 2300  # band-pass upper cutoff (Hz)

print(color.BOLD + "m8 lip trill" + color.END)

wav_name = 'm8_arpeggios_lip_trill_i.wav'

# (printed label, file-name prefix); each file is named '<prefix>-<wav_name>'.
clips = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", 'lip_trill'),
    ("Convert to belt", 'belt'),
    ("Convert to breathy", 'breathy'),
    ("Convert to vibrato", 'vibrato'),
    ("Convert to vocal fry", 'vocal_fry'),
]

for label, prefix in clips:
    print(label)
    # Load at ``sr`` explicitly so the audio rate always matches the rate the
    # filter is designed for and the player is told to use.
    x, _ = librosa.load(wav_files / tech_dir / '-'.join([prefix, wav_name]), sr=sr)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
m8 lip trill
Original Mel-spectrogram
Reconstructed Mel-spectrogram
Convert to belt
Convert to breathy
Convert to vibrato
Convert to vocal fry

The second row of Fig. 2(b) in the paper

In [55]:
# Playback settings shared by every clip in this cell.
sr = 22050   # sample rate (Hz) used for loading, filter design and playback
low = 150    # band-pass lower cutoff (Hz)
high = 2300  # band-pass upper cutoff (Hz)

print(color.BOLD + "m9 straight" + color.END)

wav_name = 'm9_arpeggios_straight_e.wav'

# (printed label, file-name prefix); each file is named '<prefix>-<wav_name>'.
clips = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", 'straight'),
    ("Convert to belt", 'belt'),
    ("Convert to breathy", 'breathy'),
    ("Convert to lip trill", 'lip_trill'),
    ("Convert to vibrato", 'vibrato'),
    ("Convert to vocal fry", 'vocal_fry'),
]

for label, prefix in clips:
    print(label)
    # Load at ``sr`` explicitly so the audio rate always matches the rate the
    # filter is designed for and the player is told to use.
    x, _ = librosa.load(wav_files / tech_dir / '-'.join([prefix, wav_name]), sr=sr)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
m9 straight
Original Mel-spectrogram
Reconstructed Mel-spectrogram
Convert to belt
Convert to breathy
Convert to lip trill
Convert to vibrato
Convert to vocal fry

Additional samples for singing technique conversion

In [56]:
# Playback settings shared by every clip in this cell.
sr = 22050   # sample rate (Hz) used for loading, filter design and playback
low = 150    # band-pass lower cutoff (Hz)
high = 2300  # band-pass upper cutoff (Hz)

print(color.BOLD + "f4 breathy" + color.END)

wav_name = 'f4_scales_breathy_o.wav'

# (printed label, file-name prefix); each file is named '<prefix>-<wav_name>'.
clips = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", 'breathy'),
    ("Convert to belt", 'belt'),
    ("Convert to lip trill", 'lip_trill'),
    ("Convert to vibrato", 'vibrato'),
    ("Convert to vocal fry", 'vocal_fry'),
]

for label, prefix in clips:
    print(label)
    # Load at ``sr`` explicitly so the audio rate always matches the rate the
    # filter is designed for and the player is told to use.
    x, _ = librosa.load(wav_files / tech_dir / '-'.join([prefix, wav_name]), sr=sr)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
f4 breathy
Original Mel-spectrogram
Reconstructed Mel-spectrogram
Convert to belt
Convert to lip trill
Convert to vibrato
Convert to vocal fry

We also provide samples for many-to-many singer conversion.

The first row of Fig. 2(a) in the paper

In [36]:
# Playback settings shared by every clip in this cell.
sr = 22050   # sample rate (Hz) used for loading, filter design and playback
low = 150    # band-pass lower cutoff (Hz)
high = 2300  # band-pass upper cutoff (Hz)

print(color.BOLD + "m3 belt" + color.END)

wav_name = 'm3_scales_belt_i.wav'
source_singer = 'm3'
target_singers = ['f1', 'f2', 'f4', 'm4', 'm6']

# (printed label, file-name prefix); each file is named '<prefix>-<wav_name>'.
# The "Convert to ..." label and the file prefix come from the same singer id,
# so they cannot drift apart (the f4 entry previously loaded the f2 file).
clips = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", source_singer),
]
clips += [("Convert to " + singer, singer) for singer in target_singers]

for label, prefix in clips:
    print(label)
    # Load at ``sr`` explicitly so the audio rate always matches the rate the
    # filter is designed for and the player is told to use.
    x, _ = librosa.load(wav_files / sid_dir / '-'.join([prefix, wav_name]), sr=sr)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))

The second row of Fig. 2(a) in the paper

In [59]:
# Playback settings shared by every clip in this cell.
sr = 22050   # sample rate (Hz) used for loading, filter design and playback
low = 150    # band-pass lower cutoff (Hz)
high = 2300  # band-pass upper cutoff (Hz)

print(color.BOLD + "f4 breathy" + color.END)

wav_name = 'f4_scales_breathy_o.wav'
source_singer = 'f4'
target_singers = ['f1', 'f2', 'm3', 'm4', 'm6']

# (printed label, file-name prefix); each file is named '<prefix>-<wav_name>'.
# The "Convert to ..." label and the file prefix come from the same singer id.
clips = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", source_singer),
]
clips += [("Convert to " + singer, singer) for singer in target_singers]

for label, prefix in clips:
    print(label)
    # Load at ``sr`` explicitly so the audio rate always matches the rate the
    # filter is designed for and the player is told to use.
    x, _ = librosa.load(wav_files / sid_dir / '-'.join([prefix, wav_name]), sr=sr)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
f4 breathy
Original Mel-spectrogram
Reconstructed Mel-spectrogram
Convert to f1
Convert to f2
Convert to m3
Convert to m4
Convert to m6

Additional samples for singer conversion

In [60]:
# Playback settings shared by every clip in this cell.
sr = 22050   # sample rate (Hz) used for loading, filter design and playback
low = 150    # band-pass lower cutoff (Hz)
high = 2300  # band-pass upper cutoff (Hz)

print(color.BOLD + "f2 straight" + color.END)

wav_name = 'f2_scales_straight_e.wav'
source_singer = 'f2'
target_singers = ['f1', 'f4', 'm3', 'm4', 'm6']

# (printed label, file-name prefix); each file is named '<prefix>-<wav_name>'.
# The "Convert to ..." label and the file prefix come from the same singer id,
# so they cannot drift apart (the f4 clip was previously labelled "f2").
clips = [
    ("Original Mel-spectrogram", 'source'),
    ("Reconstructed Mel-spectrogram", source_singer),
]
clips += [("Convert to " + singer, singer) for singer in target_singers]

for label, prefix in clips:
    print(label)
    # Load at ``sr`` explicitly so the audio rate always matches the rate the
    # filter is designed for and the player is told to use.
    x, _ = librosa.load(wav_files / sid_dir / '-'.join([prefix, wav_name]), sr=sr)
    display(Audio(butter_bandpass_filter(x, lowcut=low, highcut=high, fs=sr), rate=sr))
f2 straight
Original Mel-spectrogram
Reconstructed Mel-spectrogram
Convert to f1
Convert to f2
Convert to m3
Convert to m4
Convert to m6